#!/usr/bin/env bash
# Expose only the CUDA device with index 1 to every command started below.
export CUDA_VISIBLE_DEVICES=1

# Measured memory use and training time (bz = batch size):
#   1024*1024, bz 1: <= 31 GB
#   1024*2048, bz 1: 43.40 GB
#     1.273148 days = 10k (presumably training steps)
#     0.6 days      = 5k
# Training plan:
#   Option 1: add the embeddings together
#   Option 2: concat => embedder => embedding
#   Option 3: embedding attention

# Start training: runs train_flux_depth_local2.py through `accelerate launch`
# with the launcher settings from accelerate_one_config.yaml.
#
# The argument list is assembled in groups with `set --` (positional
# parameters are local to the function) and handed to the launcher unchanged
# and in a fixed order. Group labels follow the flag names; the exact meaning
# of each flag is defined by the training script.
run_training() {
  # Launcher configuration, followed by the training script itself.
  set -- \
    --config_file accelerate_one_config.yaml \
    train_flux_depth_local2.py

  # Training data, cache location and image resolution.
  set -- "$@" \
    --jsonl_for_train local_data_20250711.json \
    --cache_dir /mnt/nas/shengjie/cache/ \
    --image_column ori_filepath \
    --resolution_height 1024 \
    --resolution_width 2048

  # Output and logging directories.
  set -- "$@" \
    --output_dir /mnt/nas/shengjie/depth_local_output_20250714/ \
    --logging_dir logs

  # Precision and pretrained model path.
  set -- "$@" \
    --mixed_precision bf16 \
    --pretrained_model_name_or_path /home/shengjie/ckp/FLUX.1-Fill-dev

  # Batch size, data loader workers, run length and checkpoint interval.
  set -- "$@" \
    --train_batch_size 1 \
    --dataloader_num_workers 4 \
    --max_train_steps 100000 \
    --checkpointing_steps 1000

  # Rank, memory-related switches and learning-rate settings.
  set -- "$@" \
    --rank 128 \
    --gradient_checkpointing \
    --use_8bit_adam \
    --learning_rate 1e-4 \
    --lr_warmup_steps 0

  accelerate launch "$@"
}

run_training
# Flags currently disabled (not passed to the training script):
# --offload
# --quant_transformers